import logging
import os
import sys
import tempfile

from paddleocr import PaddleOCR
from pdf2image import convert_from_path
# Set the log level: only WARNING and above are emitted by the "ppocr" logger.
logging.getLogger("ppocr").setLevel(logging.WARNING)

def pdf_to_text(pdf_path):
    """Extract text from a PDF by OCR-ing each rendered page.

    Every page is rendered to an image, written to a temporary JPEG file and
    passed to PaddleOCR. The recognized strings are collected in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        All recognized text fragments joined with newlines; an empty string
        when nothing was recognized.
    """
    # Convert the PDF into one image per page.
    images = convert_from_path(pdf_path)

    # Initialize PaddleOCR ('ch' is Chinese; switch to 'en' for English).
    ocr = PaddleOCR(use_angle_cls=True, lang='ch')

    extracted_text = []
    # A private temporary directory avoids clobbering a 'temp_image.jpg' in
    # the working directory or colliding with a concurrent run, and it is
    # removed even when saving or OCR raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # The same path is overwritten for every page to keep disk use flat.
        image_file = os.path.join(tmp_dir, 'temp_image.jpg')
        for image in images:
            image.save(image_file, 'JPEG')

            result = ocr.ocr(image_file, cls=True)

            # PaddleOCR may report a page without detected text as None
            # (or an empty list); skip those instead of iterating over None.
            for line in result or []:
                if not line:
                    continue
                # word_info[1][0] holds the recognized text of one entry.
                extracted_text.extend(word_info[1][0] for word_info in line)

    return "\n".join(extracted_text)

if __name__ == "__main__":
    # The PDF path is a required first argument; exit with a usage message
    # (written to stderr, non-zero status) instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <pdf_path>".format(sys.argv[0]))

    pdf_path = sys.argv[1]
    text = pdf_to_text(pdf_path)
    print(text)
